# Load Required Packages
require(pacman)
p_load(tidyverse, corrplot, MASS, extrafont, ggforce, ggtext, cowplot, pdftools, showtext, brant, DT, nnet, foreign, reactable, plotly)
extrafont::loadfonts(device="all")
# Set Working Directory
dir <- "/Users/treypallace/Documents/Data Analyst Portfolio/awards_project/"
# Read in the Data
awards <- read_csv(paste0(dir,"awards_data.csv"))
player_data <- read_csv(paste0(dir,"player_stats.csv"))
team_data <- read_csv(paste0(dir,"team_stats.csv"))
rebounding_data <- read_csv(paste0(dir,"team_rebounding_data_22.csv"))
# Remove duplicates from awards and player_data
unique_awards <- awards %>% distinct(nbapersonid, season, .keep_all = TRUE)
unique_player_data <- player_data %>% distinct(nbapersonid, season, team, .keep_all = TRUE)
#Join player data and award data
merged<- left_join(unique_player_data, unique_awards, by = c("season", "nbapersonid"))
# Replace spaces in column names with '_'
colnames(merged) <- gsub(" ", "_", colnames(merged))
# Function to calculate average points per game
calculate_average_ppg <- function(data_frame, col_names) {
# Filters data on columns we want to calculate
filtered_data <- data_frame %>%
filter(col_names == 1)
# Sum points made
total_points <- sum(filtered_data$points)
# Sums games played
total_games <- sum(filtered_data$games)
if (total_games > 0) { # Check to make sure there is no division by 0
# Calculate average by dividing total points by total games
avg_ppg <- total_points / total_games
} else {
avg_ppg <- 0
}
return(avg_ppg)
}
# Apply function to specified columns
columns_to_apply<-c("All_NBA_First_Team", "All_NBA_Second_Team", "All_NBA_Third_Team", "all_star_game")
lapply(merged[columns_to_apply], function(col) {
calculate_average_ppg(merged, col)
})
## $All_NBA_First_Team
## [1] 25.84562
##
## $All_NBA_Second_Team
## [1] 22.89029
##
## $All_NBA_Third_Team
## [1] 20.45424
##
## $all_star_game
## [1] 21.68399
ANSWER 1:
1st Team: 25.8 points per game
2nd Team: 22.9 points per game
3rd Team: 20.5 points per game
All-Star: 21.7 points per game
# Filter data
filtered2<-merged %>%
filter((draftyear >= 2007) & ( (All_NBA_First_Team ==1) | (All_NBA_Second_Team==1) | (All_NBA_Third_Team==1)))
# Obtain the number of years between the year the player was drafted and each All NBA selection by subtracting
# the year the award was received from the year the player was drafted and adding 1
filtered2$experience<-filtered2$season - filtered2$draftyear +1
# Choose the --minimum-- value to get the years of experience for players to make their --first-- selection
min_experience <- filtered2 %>%
group_by(nbapersonid) %>%
summarize(min_experience = min(experience, na.rm = TRUE))
# Calculate the average years of experiences needed to make their first All NBA selection
mean(min_experience$min_experience)
## [1] 4.682927
ANSWER 2:
4.68 Years
data <- merged %>%
dplyr::select(
draftyear,
nbapersonid,
season,
team,
All_NBA_First_Team,
All_NBA_Second_Team,
All_NBA_Third_Team,
Most_Valuable_Player_rk,
Defensive_Player_Of_The_Year_rk,
all_star_game,
mins,
games_start
)
We don’t want to double count players that played on multiple teams in one season. We want a dataframe that can be uniquely identified by season and nbapersonid. To combine rows of players who played on multiple teams in one season, I sum the total number of minutes played and the total number of games started for each player in a season. The awards are independent of the team.
summarize_stats( ): This helper function creates two
new columns for the total minutes played (total_mins) and
total games started (total_games_start) for each player
during one season. It also creates a new teams column with
the value “multiple” inserted for players who played on more than one
team during one season
rejoin_awards( ): This helper function joins the new summarized stats with the awards data.
remove_duplicates( ):This helper function keeps distinct combinations of season and id since the join added duplicates
#------------------------------ Helper Functions---------------------------------
summarize_stats <- function(df) {
df %>%
group_by(nbapersonid, season) %>%
summarize(
teams = ifelse(n() > 1, "multiple", unique(team)),
total_mins = sum(mins),
total_games_start = sum(games_start)
) %>%
ungroup()
}
rejoin_awards <- function(df){
df %>%
left_join(data, by = c("nbapersonid", "season")) %>%
dplyr::select(-team, -mins, -games_start) # Remove previous columns
}
remove_duplicates<- function(df){
df %>%
distinct(nbapersonid, season, .keep_all = TRUE)
}
#------------------------------ Main Functions---------------------------------
handle_multiple_teams <- function() {
df_summary <-summarize_stats(data)
joined <-rejoin_awards(df_summary)
remove_duplicates(joined)
}
df3 <- handle_multiple_teams()
We want a row for each player-season combination from their rookie year up to 2021. This allows their season outcome to be classified as “Out of League” for any year after they leave the NBA.
expand_df( ): This function first creates a dataframe with a row for every possible player-season combination (each player for each season 2007-2021). Since we don’t want to consider player-season combinations from before that player joined the league, it then removes any rows for seasons before their rookie season.
expand_df <-function() {
# Create a sequence of seasons as numeric values from the minimum season to 2021
season_range <- seq(2007,2021,1)
# Create a data frame with all combinations of nbapersonid and season
expanded_df <- expand.grid(nbapersonid = unique(df3$nbapersonid), season = season_range)
# Merge the expanded_df with df3 to fill in missing values with NA
all_years<- expanded_df %>%
left_join(df3, by = c("nbapersonid", "season"))
# Find each player's rookie year (or 2007 if they were drafted before 2007)
rookie_seasons <- df3 %>%
group_by(nbapersonid) %>%
summarize(rookie_season = min(season))
# Add rookie year column
all_years_rookie <- all_years %>%
left_join(rookie_seasons, by = "nbapersonid")
# Only keep rows from rookie season - 2021
all_years_rookie %>%
filter(season >= rookie_season)
}
expanded_df <-expand_df()
Write a function that defines the season outcome for each player’s season.
is_starter( ): This helper function determines if the season outcome is qualified as “Starter”. It takes into account the adjustments needed for 2011, 2019, and 2020. A player is a “Starter” if he started in at least 41 games in the season OR if he played at least 2000 minutes in the season.
is_rotation( ): This helper function determines if the season outcome is qualified as “Rotation”. It takes into account the adjustments needed for 2011, 2019, and 2020. A player is a “Rotation” player in a season if he played at least 1000 minutes in the season.
get_season_outcome( ): This function determines the season outcome for every player-season combination. It checks the qualifications for “Elite”, “All-Star”, “Starter”, “Rotation”, “Roster”, in that order. If none of the criteria is met, the player is qualified as “Out of League”.
#--------------------------- Helper functions-----------------------------------
is_starter <- function(season, mins, games_start) {
ifelse(
season == 2011,
(mins * 82/66 >= 2000) | (games_start * 82/66 >= 41),
ifelse(
season %in% c(2019, 2020),
(mins * 82/72 >= 1000) | (games_start * 82/72 >= 41),
(mins >= 2000) | games_start >= 41
)
)
}
is_rotation <- function(season, mins) {
ifelse(
season == 2011,
mins * 82/66 >= 1000,
ifelse(
season %in% c(2019, 2020),
mins * 82/72 >= 1000,
mins >= 1000
)
)
}
#--------------------------- Main function-----------------------------------
get_season_outcome <-function(df){
df %>%
mutate(
season_outcome = case_when(
All_NBA_First_Team == 1 |
All_NBA_Second_Team == 1 |
All_NBA_Third_Team == 1 |
Most_Valuable_Player_rk == 1 |
Defensive_Player_Of_The_Year_rk == 1 ~ "Elite",
all_star_game == 1 ~ "All-Star",
is_starter(season, total_mins, total_games_start) ~ "Starter",
is_rotation(season, total_mins) ~ "Rotation",
total_mins >= 1 ~ "Roster",
TRUE
~ "Out of League"
)
)
}
season_outcomes <- get_season_outcome(expanded_df)
Write a function that defines the career outcome for each player.
list_season_outcomes( ): This helper function creates a new column containing a list of every season outcome of the player.
get_second_to_last( ): This helper function first keeps the all the season outcomes for four years after their rookie season. This is not simply removing the first four season outcomes since 2007 because some players were drafted before 2007. For example, a player drafter in 2004, could have season outcomes for 2004-2009. Therefore, this function keeps season outcomes from 2008-2009. Then the function checks to see if there are at least two remaining season outcomes in the data since we need to know the outcome of at least two seasons after his first four seasons to determine a career outcome. If there are not at least two season outcomes, they are classified as “Out-of-League”. Then the list is then ranked from lowest rank (“Out of League” = 1) to highest rank (“Elite” = 6). Finally, it returns the second to last value in the ordered list of season outcomes for each player. We want the second to last outcome since a “career outcome” represents the highest level of success that the player achieved for at least two seasons.
(Note this function cannot factor in historical data before 2007. For example, say a player was drafted in 1995 and had two Elite season outcomes from 2000-2002. However, if after 2007 they only earned season outcomes of All-Star until they left the league in 2010, their career outcome would be classified as “All-Star”. Even though they had at least two seasons after their first 4 as Elite, but this is not in the data.)
get_career_outcome( ): This function defines a career outcome for each player.
# Global Variable
rank_order <-c("Out of League", "Roster", "Rotation", "Starter", "All-Star", "Elite")
# -------------------------------- Helper functions ----------------------------
# Function to get distinct player statistic
get_player_stat <- function(data, target_var) {
data %>%
group_by(nbapersonid) %>%
distinct({{ target_var }}, .keep_all = TRUE) %>%
dplyr::select({{ target_var }})
}
draft_years <- get_player_stat(player_data, draftyear)
# Function to create the new a list of all season outcomes, ordered from lowest to highest
list_season_outcomes <- function(df) {
df %>%
group_by(nbapersonid) %>%
summarize(ordered_season_outcome = list(factor(season_outcome, levels = rank_order))) %>%
left_join(draft_years, by = "nbapersonid")
}
get_second_to_last <- function(lst, draftyear) {
if (draftyear <= 2003) {
lst <-lst # No alteration if draftyear is <= 2003 since their season after their 1st four is 2007
} else if (draftyear >= 2004 && draftyear <= 2006) {
lst <- lst[-(1:(draftyear - 2003))] # Remove seasons based on draft year (example if drafted in 2004, the season their first four seasons is 2008, so 2007 is removed)
} else {
lst <- lst[-(1:4)] # Remove the first four seasons if drafted from 2007 and beyond
}
if (length(lst) >= 2) { # Checks for players drafted in 2017 and after since they don't have enough season outcomes
lst <- sort(lst) # Sort the list in ascending order
return(lst[length(lst) - 1]) # Returns the second to last element
} else {
return(1) # If there are not at least two seasons after their rookie season, return 1 ("Out of League")
}
}
#--------------------------------- Main Function --------------------------------
get_career_outcome <- function() {
season_outcomes_clean<-season_outcomes %>%
dplyr::select(nbapersonid, season, season_outcome, draftyear)
season_outcomes_ranked <- list_season_outcomes(season_outcomes_clean)
career_ranks <- season_outcomes_ranked %>%
mutate(second_to_last = mapply(get_second_to_last, ordered_season_outcome, draftyear))
career_ranks%>%
mutate(career_outcome = rank_order[career_ranks$second_to_last])
}
career_outcomes <- get_career_outcome()
datatable(career_outcomes %>% dplyr::select(-second_to_last),
options = list(scrollX = '400px'))
# Join the career outcome data with datadraft year
career_outcomes %>%
filter(draftyear == 2010) %>%
# Group by career_outcome and count the values
group_by(career_outcome) %>%
summarize(count = n())
## # A tibble: 6 × 2
## career_outcome count
## <chr> <int>
## 1 All-Star 1
## 2 Elite 2
## 3 Out of League 43
## 4 Roster 10
## 5 Rotation 7
## 6 Starter 10
ANSWER 3:
Elite: 2 players.
All-Star: 1 players
Starter: 10 players.
Rotation: 7 players.
Roster: 10 players.
Out of League: 43 players.
The model is designed to assist in predicting NBA career outcomes for players based on their game performance statistics. The model considers NBA career outcomes on a scale of different categories, including as “Out of League,” “Roster,” “Rotation,” “Starter,” “All-Star,” and “Elite.” To train the model, historical data is used where the career outcomes of players are known, and this data is used to teach the model how different player statistics correspond to specific outcomes. Once the model has learned these patterns, it can then be applied to new or unseen data. To make predictions, our model uses a variety of player statistics. These statistics are drawn from a player’s performance throughout their career. Examples include points scored, assists, rebounds, and shooting percentages. This model can assist in assessing player potential, which can guide decisions related to contract extensions, draft picks, or trades. It helps in making more informed choices about building a competitive team. It’s important to note that the model is dynamic. As more data becomes available and player performance evolves, the model can be updated and refined to provide even more accurate predictions.
I first considered ordinal logistic regression. Ordinal logistic regression is typically used when you have a categorical outcome variable that has more than two levels. Specifically, ordinal logistic regression is used when there is a natural ordering to your outcome variable, and the distance between two outcomes is unknown. It is reasonable that an Elite outcome is superior to All-Star, and that All-Star is superior to Starter, but just how much “better” is unclear. I switched instead to a multinomial model. A multinominal model is more flexible as it has least restrictive assumptions, and it also gives you coefficients for every level. An ordinal logit would likely only fit better if the proportional odds assumption is met or really close to it. (Please see the appendix for the statistical tests and justification for not using the ordinal logistic regression model).
# List of columns to sum
sum_columns <- c('All_NBA_Defensive_First_Team', 'All_NBA_Defensive_Second_Team', 'All_NBA_First_Team',
'All_NBA_Second_Team', 'All_NBA_Third_Team', 'Bill_Russell_NBA_Finals_MVP',
'Player_Of_The_Month', 'Player_Of_The_Week', 'Rookie_Of_The_Month', 'all_star_game',
'rookie_all_star_game')
# List of columns to average
avg_columns <- c('allstar_rk', 'Defensive_Player_Of_The_Year_rk', 'Most_Improved_Player_rk',
'Most_Valuable_Player_rk', 'Rookie_Of_The_Year_rk', 'Sixth_Man_Of_The_Year_rk',
'all_nba_points_rk', 'all_rookie_points_rk', 'games', 'games_start', 'mins', 'fgm',
'fga', 'fgp', 'fgm3', 'fga3', 'fgp3', 'fgm', 'fga2', 'fgp2', 'efg', 'ftm', 'fta', 'ftp',
'off_reb', 'def_reb', 'tot_reb', 'ast', 'steals', 'blocks', 'tov', 'tot_fouls', 'points',
'PER', 'FTr', 'off_reb_pct', 'def_reb_pct', 'tot_reb_pct', 'ast_pct', 'stl_pct', 'blk_pct',
'tov_pct', 'usg', 'OWS', 'DWS', 'WS', 'OBPM', 'DBPM', 'BPM', 'VORP')
# ---------------------------- Helper Functions --------------------------------
# Function to Summarize Data
summarize_dat<-function(df, minyear, maxyear) {
df %>%
filter((draftyear>=minyear) & (draftyear<=maxyear))%>%
group_by(nbapersonid) %>%
summarize(across(all_of(sum_columns), sum, .names = "{.col}_sum"),
across(all_of(avg_columns), mean, .names = "{.col}_avg"))
}
# ------------------------------------------------------------------------------
# Summarize Data
sum_data<-summarize_dat(merged, 1991, 2015)
# Get draft pick for each player
draftpicks <- get_player_stat(player_data, draftpick)
# Add draftyear, draftpick, career_outcome to summarized
add_columns <- data.frame(career_outcomes$nbapersonid, career_outcomes$draftyear, draftpicks$draftpick, career_outcomes$second_to_last, career_outcomes$career_outcome)
# Clean column names
colnames(add_columns) <- sub("^[^.]+\\.", "", colnames(add_columns))
final_data<- left_join(sum_data, add_columns, by = "nbapersonid")
final_data <- final_data %>%
dplyr::rename(career_outcome_rank = second_to_last) %>%
dplyr::select_if(~ !all(. == 0)) %>% # remove all 0 columns
dplyr::select(-Most_Valuable_Player_rk_avg, -Rookie_Of_The_Year_rk_avg) # only had 1 non 0 value so removed
final_data[is.na(final_data)] <- 0 # replace any NAs with 0
# Choose numeric columns
numeric_columns <- sapply(final_data, is.numeric)
# Extract the names of numeric columns
numeric_column_names <- names(final_data)[numeric_columns]
# Remove nbapersonid
numeric_column_names<-numeric_column_names[-1]
# Subset the data to only include numeric data
num_data <- final_data %>%
dplyr::select(all_of(numeric_column_names))
# Calculate Correlations
correlations <- cor(num_data, use = "pairwise.complete.obs")
cor_with_career_outcome <- correlations[,"career_outcome_rank"]
correlation_data <- data.frame(Variable = names(cor_with_career_outcome), Correlation = cor_with_career_outcome)
p1 <- ggplot(correlation_data, aes(x = reorder(Variable, Correlation), y = Correlation,
text = paste("Variable:", Variable, "<br>Correlation:", Correlation))) +
geom_bar(stat = "identity", aes(fill = Correlation > 0.30)) +
geom_hline(yintercept = 0.3, linetype = "dashed", color = "#f6ab24") +
labs(y = "Variable", x = "Correlation") +
ggtitle("Correlation with Career Outcome Rank") +
scale_y_continuous(limits = c(-0.25, 1), breaks = seq(-0.25, 1, 0.25)) +
theme_minimal() +
theme(text = element_text(family="CMU Sans Serif", color = 'black'),
axis.text.y = element_text(hjust = 1, size = 10)) +
coord_flip() +
scale_fill_manual(values = c("#e62428", "#1067b4"))
# Convert ggplot to plotly
ggplotly(p1, tooltip = "text")
Variables with a correlation greater than 0.3 moved forward in the variable selection process.
correlated_vars<-c(rownames(correlation_data %>%
filter(abs(Correlation)>0.3)))
check_multi_col<-final_data %>%
dplyr::select(all_of(correlated_vars))%>%
dplyr::select(-career_outcome_rank)
library(corrplot)
corr<-cor(check_multi_col,use="pairwise.complete.obs")
high_corr_combinations <- list()
# Get the number of variables
num_vars <- ncol(corr)
# Loop through all possible pairs of variables
for (i in 1:(num_vars - 1)) {
for (j in (i + 1):num_vars) {
if (!is.na(corr[i, j]) && abs(corr[i, j]) > 0.7) {
high_corr_combinations[[length(high_corr_combinations) + 1]] <- c(rownames(corr)[i], rownames(corr)[j], corr[i,j])
}
}
}
par(family="CMU Sans Serif")
corrplot(abs(corr),method="color",
order = 'AOE',
type = 'lower',
col=colorRampPalette(c("#e62428","white","#1067b4"))(100),cl.lim=c(0,1),
tl.col = "black",
tl.cex = 0.9)
To avoid multicollinearity, the following variable were removed from
the model: VORP_avg, OWS_avg,
DWS_avg, WS_avg, tot_fouls_avg,
off_reb_avg, tot_reb_avg
,ftm_avg, fta_avg, fgm2_avg,
fga_avg, fgm3_avg, BPM_avg,
fga2_avg, tov_avg,and PER_avg
# Set a reference Level
final_data$career_outcome <- relevel(factor(final_data$career_outcome, levels = rank_order), ref = "Out of League")
# Build the model
multinom <- multinom(career_outcome ~ games_avg + games_start_avg + mins_avg + fgm_avg + fga3_avg + def_reb_avg +ast_avg + steals_avg + blocks_avg + points_avg + OBPM_avg , data = final_data)
## # weights: 78 (60 variable)
## initial value 2003.187087
## iter 10 value 1626.265092
## iter 20 value 1389.445646
## iter 30 value 1310.513004
## iter 40 value 1286.330318
## iter 50 value 1106.111217
## iter 60 value 968.547469
## iter 70 value 873.267285
## iter 80 value 870.030917
## iter 90 value 869.786560
## iter 100 value 869.783377
## final value 869.783377
## stopped after 100 iterations
# Run an empty model with no predictors
OIM <- multinom(career_outcome ~ 1, data = final_data)
## # weights: 12 (5 variable)
## initial value 2003.187087
## iter 10 value 1619.021319
## final value 1619.021255
## converged
anova(OIM,multinom)
## Likelihood ratio tests of Multinomial Models
##
## Response: career_outcome
## Model
## 1 1
## 2 games_avg + games_start_avg + mins_avg + fgm_avg + fga3_avg + def_reb_avg + ast_avg + steals_avg + blocks_avg + points_avg + OBPM_avg
## Resid. df Resid. Dev Test Df LR stat. Pr(Chi)
## 1 5585 3238.043
## 2 5530 1739.567 1 vs 2 55 1498.476 0
The chi-square test analyzes the decrease in unexplained variance from the baseline model (3238.043) to the final model (1739.567), which is a difference of 1543.47. This change is significant, which means that our final model explains a significant amount of the original variability. The likelihood ratio with a p-value = 0 tells us that the model as a whole fits significantly better than an empty (i.e., a model with no predictors).
Consideration of Many Variables: The model takes into account a wide range of player statistics, allowing it to capture nuanced and multifaceted aspects of player performance. This consideration of many variables enhances the model’s ability to make accurate predictions.
Sample Size: The dataset includes a substantial sample size of 1,118 NBA players. This large sample size provides statistical robustness and increases confidence in the model’s predictions.
Model Performance: Based on the chi-square test and likelihood ratio test, the model explains a significant amount of variability. In other words, the model can provide a reasonable representation of how the data points are spread out and how they change or vary with respect to the different variables.
Interpretability: Multinomial logistic regression provides clear parameter estimates, simplifying the identification of influential player statistics in career outcomes. It’s easier to interpret than more complex methods, like deep learning or random forests, making it an accessible tool for informed decision-making in sports analytics.
Easy Model Updates: The model can be easily updatable with new data. In dynamic environments like professional sports, player performance data is continually evolving. The model can be retrained with the latest data to adapt to changing trends and player dynamics. This flexibility allows sports analysts and decision-makers to stay current and make real-time adjustments, ensuring that the model remains relevant and effective over time.
Sample Balance: There are more players in the dataset having “Out of League” or “Roster” outcomes compared to rarer outcomes like “Elite”.There’s a higher chance that the model will perform well on the majority class but poorly on the minority class i.e. it might predict “Rotation” or “Starter” more accurately but struggle with “Elite” players.
Assumptions: Multinomial logistic regression assumes that the relationships between predictors and outcomes are linear. If the relationships are highly nonlinear, the model’s accuracy may suffer.
Data Augmentation: Expanding the dataset by collecting more player statistics over time can help increase the sample size, especially for rare career outcomes. This enhances the model’s ability to predict less common scenarios like “Elite.”
Incorporate External Factors: Consider incorporating external factors like team performance, marketability, and injury history into the model to account for additional influences on career outcomes.
Ensemble Methods: Combine multiple models, each trained on different subsets of the data or with different parameters, to improve overall prediction performance.
Advanced Modeling Techniques: Exploring more advanced machine learning techniques, such as random forests or gradient boosting, may capture complex relationships in the data that multinomial logistic regression cannot. These methods are more flexible and also less sensitive to class imbalance.
Data Splitting:
Model Training:
# library(randomForest)
#
# # 'data' is the training dataset
# model <- randomForest(career_outcome ~ ., data = data)
Hyperparameter Tuning:
Random Forest has hyperparameters that you can tune to optimize
model performance. Common hyperparameters include the number of trees
(ntree), the number of features to consider at each split
(mtry), and the maximum depth of each tree
(max.depth).
I would use techniques like cross-validation and grid search to find the best combination of hyperparameters.
# Example of hyperparameter tuning
#tuneRF(data, tunecontrol = randomForest::tuneControl(N = 10))
Feature Importance:
# # Get feature importance scores
# importance <- importance(model)
# Filter Data to players drafter in 2018 and after and compute summary statistics
predict_players_data <- summarize_dat(merged, 2018, 2021)
# Replaces NAs with 0s
predict_players_data[is.na(predict_players_data)] <- 0
colnames<-c( "nbapersonid", "Out of League" , "Roster" , "Rotation" , "Starter", "All-Star", "Elite")
# Make the Prediction
pred_2018<-cbind(predict_players_data,
predict(multinom, predict_players_data, type="p")) %>%
dplyr::select(all_of(colnames))
datatable(pred_2018, options = list(scrollX = '400px'))
# Filter Data to just these 4 players
predict_players<-player_data %>%
filter(player %in%c( 'Shai Gilgeous-Alexander', 'Zion Williamson', 'James Wiseman', 'Josh Giddey'))%>%
dplyr::select(nbapersonid, player)
# Summarize the Data
predict_players_data <-summarize_dat(merged, 1991, 2021) %>%
filter(nbapersonid %in% predict_players$nbapersonid)
predict_players_data[is.na(predict_players_data)] <- 0
colnames<-c( "nbapersonid", "Out of League" , "Roster" , "Rotation" , "Starter", "All-Star", "Elite")
# Make the Prediction
final_predict<-cbind(predict_players_data,
predict(multinom, predict_players_data, type="p")) %>%
dplyr::select(all_of(colnames))
# Convert Data to Long Format
final_predict_long <- pivot_longer(final_predict,
cols = -nbapersonid,
names_to = "career_outcome",
values_to = "value")
player_levels<-c('1628983','1629627' ,'1630164','1630581')
final_predict_long$nbapersonid <- factor(final_predict_long$nbapersonid, levels = player_levels)
# Adjust data to prepare for Sankey Diagram
prediction_results <- final_predict_long %>%
gather_set_data(1:2) %>%
mutate_at(vars(nbapersonid),
funs(factor(., levels = player_levels))) %>%
mutate_at(vars(career_outcome),
funs(factor(., levels = rev(rank_order))))
font_add_google("Roboto Condensed", family = "roboto")
ggplot(prediction_results, aes(
x = x,
id = id,
split = factor(
y,
levels = c(
"1628983",
"1629627",
"1630164",
"1630581",
"Elite",
"All-Star",
"Starter",
"Rotation",
"Roster",
"Out of League"
)
),
value = value
)) +
geom_parallel_sets(
aes(fill = nbapersonid),
alpha = 0.8,
axis.width = -0.01,
n = 100,
strength = 0.6
) +
geom_parallel_sets_axes(fill = "grey20", axis.width = -0.01) +
scale_color_manual(
values = c(
"1628983" = "#f6ab24",
"1629627" = "#18224e",
"1630164" = "#e62428",
"1630581" = "#1067b4"
)
) +
scale_fill_manual(
values = c(
"1628983" = "#f6ab24",
"1629627" = "#18224e",
"1630164" = "#e62428",
"1630581" = "#1067b4"
)
)+
annotate(
geom = "text",
x = 2.07,
y = 4.2,
label = "Elite",
size = 6,
family = "roboto"
) +
annotate(
geom = "text",
x = 2.1,
y = 2.9,
label = "All-Star",
size = 6,
family = "roboto"
) +
annotate(
geom = "text",
x = 2.1,
y = 1.8,
label = "Starter",
size = 6,
family = "roboto"
) +
annotate(
geom = "text",
x = 2.105,
y = 0.9,
label = "Rotation",
size = 6,
family = "roboto"
) +
annotate(
geom = "text",
x = 2.09,
y = 0.4,
label = "Roster",
size = 6,
family = "roboto"
) +
annotate(
geom = "text",
x = 2.16,
y = 0.05,
label = "Out of League",
size = 6,
family = "roboto"
) +
annotate(
geom = "text",
x = 0.75,
y = 4.1,
label = "Shai Gilgeous-Alexander",
size = 6,
family = "roboto"
) +
annotate(
geom = "text",
x = 0.8,
y = 2.9,
label = "Zion Williamson",
size = 6,
family = "roboto"
) +
annotate(
geom = "text",
x = 0.8,
y = 1.9,
label = "James Wiseman",
size = 6,
family = "roboto"
) +
annotate(
geom = "text",
x = 0.8,
y = 0.6,
label = "Josh Giddey",
size = 6,
family = "roboto"
) +
theme_minimal() +
theme(
legend.position = "none",
plot.background = element_rect(fill = "white", color = "transparent"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.text = element_blank(),
axis.title = element_blank()
) +
coord_cartesian(clip = "off") +
theme(plot.margin = unit(c(2, 5, 2,5), "lines"))
# Filter Data to players drafter in 2018 and after and compute summary statistics
player_names <- get_player_stat(player_data, player)
# Summarize Data and add pLayer names
predict_players_data <-summarize_dat(merged, 2019, 2021) %>%
left_join(player_names)
# Replaces NAs with 0s
predict_players_data[is.na(predict_players_data)] <- 0
colnames<-c( "nbapersonid", "player","Out of League" , "Roster" , "Rotation" , "Starter", "All-Star", "Elite")
# Make the Prediction
pred_2019<-cbind(predict_players_data,
predict(multinom, predict_players_data, type="p")) %>%
dplyr::select(all_of(colnames))
# Find the outcome with the highest probability for each row
pred_2019$most_likely_outcome <- names(pred_2019[, rank_order])[apply(pred_2019[, rank_order], 1, which.max)]
reactable(pred_2019)
# Filter to include rows with OKC as the offensive team and games 1-80
okc <- rebounding_data %>%
filter((team == "OKC" ) & (game_number %in% seq(1,80)))
# Find OKC's predicted offensive rebound percent is for game 81
sum(okc$offensive_rebounds)/sum(okc$off_rebound_chances) * 100
## [1] 28.86898
ANSWER 1:
28.9%
ANSWER 2:
ANSWER 3:
ologit <- polr(career_outcome ~ games_avg + games_start_avg + mins_avg + fgm_avg + fga3_avg + def_reb_avg +ast_avg + steals_avg + blocks_avg + points_avg + OBPM_avg , data = final_data, Hess = T)
summary(ologit)
## Call:
## polr(formula = career_outcome ~ games_avg + games_start_avg +
## mins_avg + fgm_avg + fga3_avg + def_reb_avg + ast_avg + steals_avg +
## blocks_avg + points_avg + OBPM_avg, data = final_data, Hess = T)
##
## Coefficients:
## Value Std. Error t value
## games_avg -0.025276 0.0111345 -2.2700
## games_start_avg 0.041733 0.0117133 3.5629
## mins_avg 0.002662 0.0008278 3.2162
## fgm_avg -0.024741 0.0070944 -3.4874
## fga3_avg -0.003523 0.0015511 -2.2712
## def_reb_avg 0.007168 0.0019981 3.5874
## ast_avg 0.003833 0.0016099 2.3811
## steals_avg -0.003366 0.0076215 -0.4416
## blocks_avg 0.008740 0.0058956 1.4825
## points_avg 0.011667 0.0028267 4.1276
## OBPM_avg 0.071420 0.0267750 2.6674
##
## Intercepts:
## Value Std. Error t value
## Out of League|Roster 1.6735 0.2070 8.0862
## Roster|Rotation 3.2073 0.2313 13.8669
## Rotation|Starter 4.7414 0.2598 18.2468
## Starter|All-Star 10.4976 0.4738 22.1557
## All-Star|Elite 12.2196 0.5500 22.2160
##
## Residual Deviance: 1861.736
## AIC: 1893.736
Brant Test assesses whether the observed deviations from our Ordinal Logistic Regression model are larger than what could be attributed to chance alone.
brant(ologit)
## ----------------------------------------------------
## Test for X2 df probability
## ----------------------------------------------------
## Omnibus 241.15 44 0
## games_avg 13.16 4 0.01
## games_start_avg 17.86 4 0
## mins_avg 16.85 4 0
## fgm_avg 18.24 4 0
## fga3_avg 15.03 4 0
## def_reb_avg 9.22 4 0.06
## ast_avg 10.87 4 0.03
## steals_avg 11.92 4 0.02
## blocks_avg 4.51 4 0.34
## points_avg 24.94 4 0
## OBPM_avg 9.7 4 0.05
## ----------------------------------------------------
##
## H0: Parallel Regression Assumption holds
Since the p-values for each variable, except blocks_avg, are less than any reasonable alpha level, we reject the null hypothesis that the parallel regression (proportional odds) assumption is met.
The Lipsitz test is a goodness of fit test for ordinal response logistic regression models.
library(generalhoslem)
lipsitz.test(ologit)
##
## Lipsitz goodness of fit test for ordinal response models
##
## data: formula: career_outcome ~ games_avg + games_start_avg + mins_avg + fgm_avg + formula: fga3_avg + def_reb_avg + ast_avg + steals_avg + blocks_avg + formula: points_avg + OBPM_avg
## LR statistic = 23.006, df = 9, p-value = 0.006183
Since the p-value of 0.006 is less than any reasonable alpha level, we reject the null hypothesis, indicating a poor correlation between predicted and observed values.